import matplotlib.pyplot as plt
from IPython.core import display
Can we scrape HN? https://news.ycombinator.com/item?id=1721105
# Output files: the scraped posts and the term-document matrix derived from them.
posts_savefile = 'posts.csv'
tdm_savefile = 'posts_tdm.csv'
# (year, month, url) for each monthly "Who is hiring?" HN thread to scrape.
urls = (
(2011, 1, 'https://news.ycombinator.com/item?id=2057704'),
(2011, 2, 'https://news.ycombinator.com/item?id=2161360'),
(2011, 3, 'https://news.ycombinator.com/item?id=2270790'),
(2011, 4, 'https://news.ycombinator.com/item?id=2396027'),
(2011, 5, 'https://news.ycombinator.com/item?id=2503204'),
(2011, 6, 'https://news.ycombinator.com/item?id=2607052'),
(2011, 7, 'https://news.ycombinator.com/item?id=2719028'),
(2011, 8, 'https://news.ycombinator.com/item?id=2831646'),
(2011, 9, 'https://news.ycombinator.com/item?id=2949787'),
(2011, 10, 'https://news.ycombinator.com/item?id=3060221'),
(2011, 11, 'https://news.ycombinator.com/item?id=3181796'),
(2011, 12, 'https://news.ycombinator.com/item?id=3300290'),
(2012, 1, 'https://news.ycombinator.com/item?id=3412900'),
(2012, 2, 'https://news.ycombinator.com/item?id=3537881'),
(2012, 3, 'https://news.ycombinator.com/item?id=3652041'),
(2012, 4, 'https://news.ycombinator.com/item?id=3783657'),
(2012, 5, 'https://news.ycombinator.com/item?id=3913997'),
(2012, 6, 'https://news.ycombinator.com/item?id=4053076'),
(2012, 7, 'https://news.ycombinator.com/item?id=4184755'),
(2012, 8, 'https://news.ycombinator.com/item?id=4323597'),
(2012, 9, 'https://news.ycombinator.com/item?id=4463689'),
(2012, 10, 'https://news.ycombinator.com/item?id=4596375'),
(2012, 11, 'https://news.ycombinator.com/item?id=4727241'),
(2012, 12, 'https://news.ycombinator.com/item?id=4857714'),
(2013, 1, 'https://news.ycombinator.com/item?id=4992617'),
(2013, 2, 'https://news.ycombinator.com/item?id=5150834'),
(2013, 3, 'https://news.ycombinator.com/item?id=5304169'),
(2013, 4, 'https://news.ycombinator.com/item?id=5472746'),
(2013, 5, 'https://news.ycombinator.com/item?id=5637663'),
(2013, 6, 'https://news.ycombinator.com/item?id=5803764'),
(2013, 7, 'https://news.ycombinator.com/item?id=5970187'),
(2013, 8, 'https://news.ycombinator.com/item?id=6139927'),
(2013, 9, 'https://news.ycombinator.com/item?id=6310234'),
(2013, 10, 'https://news.ycombinator.com/item?id=6475879'),
(2013, 11, 'https://news.ycombinator.com/item?id=6653437'),
(2013, 12, 'https://news.ycombinator.com/item?id=6827554'),
(2014, 1, 'https://news.ycombinator.com/item?id=6995020'),
(2014, 2, 'https://news.ycombinator.com/item?id=7162197'),
(2014, 3, 'https://news.ycombinator.com/item?id=7324236'),
(2014, 4, 'https://news.ycombinator.com/item?id=7507765'),
(2014, 5, 'https://news.ycombinator.com/item?id=7679431')
)
def filename(year, month):
    """Return the local cache path for one month's saved thread HTML."""
    return 'html/hn_{0}_{1}.html'.format(year, month)
# keep the thread URLs in a DataFrame too, handy for saving to CSV later
import pandas as pd

urlsdf = pd.DataFrame([list(row) for row in urls], columns=['year', 'month', 'url'])
urlsdf.head(3)
year | month | url | |
---|---|---|---|
0 | 2011 | 1 | https://news.ycombinator.com/item?id=2057704 |
1 | 2011 | 2 | https://news.ycombinator.com/item?id=2161360 |
2 | 2011 | 3 | https://news.ycombinator.com/item?id=2270790 |
from bs4 import BeautifulSoup
import collections
import os.path
import requests
import time
# Fetch every month's thread (including its paginated 'More' pages) into a
# local HTML file. Failed months are re-queued and retried up to 3x total.
stack = collections.deque(urls)
tries = len(stack) * 3  # maximum attempts 3 times of number of URLs
while tries > 0:
    tries -= 1
    current = stack.pop()
    year, month, url = current
    # local html output file
    fname = filename(year, month)
    # start fresh: remove any partial output from a previous failed attempt
    if os.path.isfile(fname):
        os.remove(fname)
    try:
        # get the HN pages for month / year; ym_pages acts as a worklist of
        # pages still to fetch for this one thread
        ym_pages = [url]
        while ym_pages:
            url = ym_pages.pop()
            print "Fetching URL: %s" % (url)
            r = requests.get(url)
            # fail if bad error code
            if r.status_code != requests.codes.ok:
                raise Exception('Error from server: ' + str(r.status_code))
            text = r.text.replace('&', '_')  # broken HTML escapes breaking BeautifulSoup, removing
            # append mode: one month spans several concatenated HTML documents
            with open(fname, 'a') as htmlfile:
                htmlfile.write(text.encode('utf-8'))
            # check for 'More' link to the next page of this thread
            soup = BeautifulSoup(text)
            links = soup.find_all('a', text='More')
            if links:
                # sometimes forward slash is being html escaped and messed
                # up by above & replacement, need to replace again
                link_url = 'https://news.ycombinator.com' + links[0]['href'].replace('_#x2F;', '/')
                ym_pages.append(link_url)
            # take a break for 30 seconds between requests (rate limiting)
            time.sleep(30)
    except Exception as e:
        print 'error:', e, 'currently on:', current
        # stick current URL at the beginning of the queue for a later retry
        stack.appendleft(current)
    # get out when stack is empty
    if not stack: break
Parse the saved Hacker News thread HTML into {user, post} dicts. Can we scrape HN? https://news.ycombinator.com/item?id=1721105
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
def html_to_posts(html):
    """Parse an html document into posts.

    Returns a list of {'user': ..., 'post': ...} dicts, one per top-level
    comment, skipping [deleted]/[dead] comments.
    """
    posts = []
    html = html.replace("<br>", "<br/>")  # unclosed <br>'s are messing up BeautifulSoup
    soup = BeautifulSoup(html)
    # navigate HN's table-based layout down to the comments table
    # NOTE(review): assumes the old (pre-redesign) HN markup -- verify against the saved pages
    tables = soup.body.center.table('tr', recursive=False)[2].td('table', recursive=False)
    comments_table = tables[1] if len(tables) > 1 else tables[0]
    rows = comments_table('tr', recursive=False)
    for row in rows:
        # check if td and table exist
        if not row.td or not row.td.table: continue
        # top-level comments have a zero-width spacer image (no reply indent)
        spacer_img = row.td.table.tr.td.img
        if not spacer_img['width'] == '0': continue
        comment_tag = row.find_all('span', class_='comment')[0]
        comment = comment_tag.get_text(separator=' ')
        #print comment[:30]
        if comment == '[deleted]' or comment == '[dead]': continue
        head_tag = row('span', class_='comhead')[0]
        user = head_tag.a.text
        posts.append({'user': user, 'post': comment.encode('utf-8')})
    return posts
# urls = [(2012, 7, 'https://news.ycombinator.com/item?id=4184755')]
import datetime

# Parse every saved month file into per-post dicts tagged with the thread date.
posts = []
for year, month, url in urls:
    # each saved file holds several concatenated HTML documents (one per
    # paginated page), so split on the closing </html> tag
    with open(filename(year, month)) as htmlfile:
        all_html = htmlfile.read()
    start_html = 0
    while start_html < len(all_html):
        end_html = all_html.find('</html>', start_html)
        if end_html == -1:
            # no closing tag: consume the remainder (prevents an infinite
            # loop on a truncated file, where find() keeps returning -1)
            end_html = len(all_html)
        else:
            end_html += len('</html>')
        html = all_html[start_html:end_html]
        start_html = end_html
        for post in html_to_posts(html):
            # datetime.datetime instead of pd.datetime (removed in pandas 2.0)
            post['date'] = datetime.datetime(year, month, 1)
            posts.append(post)
postsdf = pd.DataFrame(posts, columns=['date', 'user', 'post'])
postsdf.to_csv(posts_savefile, index=False)
# round-trip through CSV so later cells see exactly what a reload produces
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])
postsdf.head(3)
date | user | post | |
---|---|---|---|
0 | 2011-01-01 00:00:00 | lkrubner | In New York City there are a lot of jobs. I we... |
1 | 2011-01-01 00:00:00 | jasonfried | 37signals is hiring two Rails programmers:\n h... |
2 | 2011-01-01 00:00:00 | tptacek | Chicago (or remote) Matasano Security LEAD SOF... |
postsdf.tail(3)
date | user | post | |
---|---|---|---|
9938 | 2014-05-01 00:00:00 | jasonlotito | MeetMe - New Hope, PA (near Philadelphia, Penn... |
9939 | 2014-05-01 00:00:00 | ssharpe67 | Datalex - Atlanta, GA\nReady to use your tech ... |
9940 | 2014-05-01 00:00:00 | findwork | Disclaimer: Forgive me for posting here. I jus... |
# add year and month columns to the posts DataFrame
postsdf['year'] = [d.year for d in postsdf.date]
postsdf['month'] = [d.month for d in postsdf.date]
# count posts per monthly thread date
ymdf = pd.DataFrame({'count': postsdf.groupby(['date']).size()}).reset_index()
ymdf['year'] = [d.year for d in ymdf.date]
ymdf['month'] = [d.month for d in ymdf.date]
# display a year x month table of post counts
ymdf[['year', 'month', 'count']].pivot(index='year', columns='month', values='count')
month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||
2011 | 88 | 150 | 27 | 218 | 217 | 257 | 224 | 230 | 191 | 198 | 230 | 203 |
2012 | 149 | 201 | 251 | 201 | 231 | 227 | 194 | 245 | 214 | 248 | 221 | 230 |
2013 | 192 | 219 | 291 | 343 | 323 | 263 | 292 | 309 | 239 | 426 | 298 | 263 |
2014 | 223 | 330 | 340 | 356 | 389 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# get unique years in the DataFrame
years = postsdf['year'].unique()
# wide figure: overall trend on the left, per-year overlay on the right
fig = plt.figure(figsize=(15, 3))
# plot all the data
ax = fig.add_subplot(121)
ymdf[['date', 'count']].set_index('date').plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Number of Posts Each Month Since January 2011")
# plot data split out by year
ax = fig.add_subplot(122)
# keyword arguments: positional index/columns for pivot were removed in pandas 2.0
df = ymdf[['count', 'year', 'month']].pivot(index='month', columns='year')
# display(df)
df.plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Split Out Per Year")
plt.show()
# weekday (0=Monday .. 6=Sunday) each monthly thread was posted on
postsdf['weekday'] = [d.weekday() for d in postsdf['date']]
# restrict to March 2011 onwards (earlier threads have unrepresentative volume);
# pd.Timestamp replaces the original datetime.date, which was never imported
# (NameError) and compares cleanly against the datetime64 'date' column
mar_2011 = pd.Timestamp(2011, 3, 1)
after_mar_2011 = postsdf[postsdf['date'] > mar_2011]
posts_date_day = after_mar_2011[['date', 'weekday']]
grouped = posts_date_day.groupby(['date', 'weekday'])
# posts per (date, weekday) pair; each date has exactly one weekday, so this
# is simply the per-thread post count tagged with its weekday
thread_counts = pd.DataFrame(grouped.size()).reset_index()
# mean posts per thread, for each weekday the thread was posted on
mean_by_weekday = thread_counts.groupby('weekday').mean()
#mean_by_weekday
# load the newline-delimited stopword list
with open('stopwords') as stopfile:
    stopwords = [line.strip() for line in stopfile]
# reload the posts from disk so this section can run standalone
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])#[:2500]
postsdf.tail(3)
date | user | post | |
---|---|---|---|
9938 | 2014-05-01 00:00:00 | jasonlotito | MeetMe - New Hope, PA (near Philadelphia, Penn... |
9939 | 2014-05-01 00:00:00 | ssharpe67 | Datalex - Atlanta, GA\nReady to use your tech ... |
9940 | 2014-05-01 00:00:00 | findwork | Disclaimer: Forgive me for posting here. I jus... |
import re
postsdf2 = postsdf.drop('user', axis=1)
def merge(v):
    """Join an iterable of post strings into one space-separated string."""
    joined = ' '.join(v)
    return joined
def words_in_post(post, stop_words=None):
    """Tokenize *post* and count occurrences of each non-stopword term.

    Punctuation and digit characters are replaced by spaces, the text is
    lower-cased and split on whitespace, and stopwords are dropped.

    post: a string (typically all of one month's posts concatenated).
    stop_words: optional iterable of words to exclude; defaults to the
        module-level ``stopwords`` list for backward compatibility.
    Returns a dict-like (Counter) mapping of term -> occurrence count.
    """
    if stop_words is None:
        stop_words = stopwords
    # set membership is O(1) per word vs O(n) scanning a list every time
    stop_set = set(stop_words)
    # NOTE: inside a character class '|' is a literal, so '|' characters are
    # also treated as separators; pattern kept as-is to preserve tokenization
    post = re.sub(r'[\. |, |\-|/|\(|\)|;|\[|\]|:|!|"|?|=|_|0-9]', ' ', post)
    words = [word for word in post.lower().split() if word not in stop_set]
    # Counter replaces the hand-rolled dict.get counting loop
    return collections.Counter(words)
# one row per thread date: post count plus all post text concatenated together
grouped = postsdf2.groupby(['date'])
alltextdf = pd.DataFrame({
    'post_count': grouped.size(),
    'alltext': grouped['post'].apply(merge),
})
alltextdf = alltextdf.reset_index()
# loop over month/years and build the term-document matrix:
# one row per (thread date, term)
tdm_cols = ['date', 'year', 'month', 'term', 'count', 'post_count', 'word_count']
frames = []
for row in alltextdf.itertuples(index=False):
    words = words_in_post(row.alltext)
    date = row.date
    word_count = len(words)  # number of distinct terms that month
    frames.append(pd.DataFrame(
        [(date, date.year, date.month, term, count, row.post_count, word_count)
         for term, count in words.items()],
        columns=tdm_cols))
# concatenate once at the end: repeated pd.concat inside the loop is O(n^2);
# this also replaces the `type(tdm_df) != pd.DataFrame` first-iteration check
tdm_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=tdm_cols)
# term frequency normalised by the number of posts that month
tdm_df['prop'] = 1.0 * tdm_df['count'] / tdm_df['post_count']
tdm_df.to_csv(tdm_savefile, index=False)
# round-trip through CSV so later cells can start from the saved file
tdm_df = pd.read_csv(tdm_savefile, parse_dates=[0])
tdm_df.head(2)
#display(tdm_df.tail(2))
date | year | month | term | count | post_count | word_count | prop | |
---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 2011 | 1 | secondly | 1 | 88 | 1745 | 0.011364 |
1 | 2011-01-01 00:00:00 | 2011 | 1 | sbnation | 2 | 88 | 1745 | 0.022727 |
import itertools
# cycle line styles so overlapping series stay distinguishable
linecycler = itertools.cycle(['-', '--', ':'])
fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(111)
terms = sorted(['java', 'php', 'python', 'rails', 'django', 'hadoop', 'ember', 'angularjs', 'meteor', 'javascript'])
# restrict the term-document matrix to just the tracked terms
df = tdm_df[tdm_df.term.isin(terms)][['date', 'term', 'prop']]
# one line per term, plotting its proportion over time
for term in terms:
    term_df = df[df['term'] == term][['date', 'prop']].set_index(['date'])
    ax.plot(term_df.index, term_df.values, linestyle=next(linecycler), label=term, linewidth=9)
plt.legend(loc=2)
plt.show()